from IPython.display import Image
from PIL import Image
path="/Users/vats/Downloads/1stscraping.PNG"
display(Image.open(path))
path="/Users/vats/Downloads/2ndscraping.PNG"
display(Image.open(path))
path="/Users/vats/Downloads/scrapyterminal1.JPG"
display(Image.open(path))
path="/Users/vats/Downloads/scrapyterminal2.JPG"
display(Image.open(path))
path="/Users/vats/Downloads/scrapyterminal4.png"
display(Image.open(path))
path="/Users/vats/Downloads/scrapyterminal3.png"
display(Image.open(path))
path="/Users/vats/Downloads/imagescrapingJSON.png"
display(Image.open(path))
#importing packages
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
plt.style.use('fivethirtyeight')
df = pd.read_csv('fulldata.csv')
%pylab inline
import datetime as dt
import statsmodels.api as sm
pd.set_option('display.max_columns', 40)
def getNumbers(str):
array = re.findall(r'[0-9]+', str)
return array
feet=[]
inches= []
for i in range(len(df)) :
list1 = getNumbers(df['Height'][i])
feet.append(list1[0])
inches.append(list1[1])
list1 = []
feet_df = pd.DataFrame(feet)
inches_df = pd.DataFrame(inches)
df['feet'] = feet_df.astype(float)
df['inches'] = inches_df.astype(float)
height_in_cms = []
for x in range (len(df)) :
h = ((df['feet'][x])*30.48 + (df['inches'][x]*2.54))
height_in_cms.append(h)
df['Height in cms']=height_in_cms
weight = []
for i in range(len(df)) :
list2 = getNumbers(df['Weight'][i])
weight.append(list2[0])
list2 = []
df['Weight']=weight
df['Value']=df['Value'].apply(lambda x: x.strip('€'))
df['Wage']=df['Wage'].apply(lambda x: x.strip('€'))
df.Value = (df.Value.replace(r'[KM]+$', '', regex=True).astype(float) * \
df.Value.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1).replace(['K','M'], [10**3, 10**6]).astype(int))
df.Wage = (df.Wage.replace(r'[KM]+$', '', regex=True).astype(float) * \
df.Wage.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1).replace(['K','M'], [10**3, 10**6]).astype(int))
del df['LoanDateEnd']
del df['BestPosition']
del df['ReleaseClause']
del df['IntlReputation']
del df['Growth']
del df['BaseStats']
del df['AttackingWorkRate']
del df['DefensiveWorkRate']
del df['Hits']
df=df[df['Club'].notnull()]
df=df[df['Club']!='111648']
#To check the null values present in columns
df.isnull().sum()
#We check again for null values, Joined can be null as it is not important to be filled up for our analysis
mean=['Volleys','curve','Agility','Balance','Jumping','Interceptions','Positioning','Vision','Composure','SlidingTackle']
for i in mean:
df[i]=df[i].fillna((df[i].mean()))
df.isnull().sum()
#Is there a relation between age and overall rating?
plt.figure(1,figsize=(15,5))
sns.regplot(df['Age'],df['Overall'])
plt.title('Age vs Overall Scatter Plot')
plt.show()
#Is there a relation between height and overall rating?
plt.figure(1,figsize=(10,5))
sns.regplot(df['Height in cms'],df['Overall'])
plt.title('Height in cms vs Overall Scatter Plot')
plt.show()
#The most loyal players
#Finding the number of years the players have played for the same team and then ordering them by the highest value
now = datetime.datetime.now()
df['Year of Joining'] = df.Joined.dropna().map(lambda x: x.split(',')[1].split(' ')[1])
df['Loyalty'] = (df['Year of Joining'].dropna().map(lambda x: now.year - int(x))).astype('int')
Loyalty = df[['Name', 'Club', 'Age','Joined','Loyalty']].sort_values(by = 'Loyalty', ascending = False).head(10)
Loyalty.set_index('Name', inplace=True)
Loyalty
#Clubs with the highest age gap between players
def agegap(group):
return group.max() - group.min()
df.groupby(by=df['Club'])['Age'].agg(agegap).sort_values(ascending=False).head()
#Finding the distribution of players in each position
plt.figure(figsize = (12, 8))
sns.set(style = 'dark', palette = 'deep', color_codes = True)
ax = sns.countplot('Position', data = df, color = 'violet')
ax.set_xlabel(xlabel = 'Different Positions in Soccer', fontsize = 14)
ax.set_ylabel(ylabel = 'Count of Players in each Position', fontsize = 14)
ax.set_title(label = 'Graph of Positions and Players', fontsize = 20)
plt.show()
#Teams paying the most wages
df.groupby(by=df['Club'])['Wage'].agg(sum).sort_values(ascending=False).head()
#Players who earn the most
df.sort_values(by = 'Wage' , ascending = False)[['Name','Club','Country','Overall','Age','Position','Value','Wage']].head()
#The most expensive players
df.sort_values(by = 'Value' , ascending = False)[['Name','Club','Country','Overall','Age','Position','Value','Wage']].head()
np.corrcoef(df['Value'],df['Wage'])
plt.figure(1,figsize=(10,5))
sns.regplot(df['Wage'],df['Value'])
plt.title('Value vs Wage Scatter Plot')
plt.show()
#Highest rated player in each position
df.groupby(by=df['Position'])['Overall'].agg('max')
x=pd.DataFrame()
grouped = df.groupby(by=df['Position'])
position=df['Position'].unique()
for i in position:
x=x.append(grouped.get_group(i).sort_values(by='Overall',ascending=False).head(1),ignore_index=True)
x
# What are the best 10 features for each position and according to that we make our dream team
dream_team={}
features = ('Crossing', 'finishing',
'HeadingAccuracy', 'ShortPass', 'Volleys', 'Dribble',
'curve', 'FreeKickAcc', 'LongPass', 'BallControl',
'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
'ShotPower', 'Jumping', 'Stamina', 'Strength',
'LongShots', 'Aggression', 'Interceptions',
'Positioning', 'Vision', 'Penalties', 'Composure',
'Marking', 'StandingTackle', 'SlidingTackle', 'GKdiving',
'GKhandling', 'GKkicking', 'GKpositioning', 'GKreflexes')
for i, val in df.groupby(df['Position'])[features].mean().iterrows():
dream_team[i]=list(val.nlargest(10).index)
print('For {}: {}, {}, {}, {}, {},{},{},{},{},{}'.format(i, *list(val.nlargest(10).index)))
df2=df.copy()
def sum_frame_by_column(frame, new_col_name, list_of_cols_to_sum):
frame[new_col_name] = frame[list_of_cols_to_sum].astype(float).sum(axis=1)
return(frame.sort_values(by=new_col_name,ascending=False).head(1))
best11=pd.DataFrame()
#For GK
df_GK=df[df['Position']=='GK']
pd.set_option('mode.chained_assignment', None)
sum_frame_by_column(df_GK, 'Sum', ['GKreflexes', 'GKdiving', 'GKhandling', 'GKpositioning', 'GKkicking','Strength','Jumping','Reactions','Balance','Composure'])
best11=best11.append(sum_frame_by_column(df_GK, 'Sum', ['GKreflexes', 'GKdiving', 'GKhandling', 'GKpositioning', 'GKkicking','Strength','Jumping','Reactions','Balance','Composure']),ignore_index=True)
best11
#For LB
df_LB=df[df['Position']=='LB']
pd.set_option('mode.chained_assignment', None)
sum_frame_by_column(df_LB, 'Sum', ['Strength', 'Jumping', 'StandingTackle', 'Aggression', 'SlidingTackle','HeadingAccuracy','Marking','Interceptions','Stamina','Reactions'])
best11=best11.append(sum_frame_by_column(df_LB, 'Sum', ['Strength', 'Jumping', 'StandingTackle', 'Aggression', 'SlidingTackle','HeadingAccuracy','Marking','Interceptions','Stamina','Reactions']),ignore_index=True)
best11
#For the 2 CBs
df_CB=df[df['Position']=='CB']
pd.set_option('mode.chained_assignment', None)
def sum_frame_by_column2(frame, new_col_name, list_of_cols_to_sum):
frame[new_col_name] = frame[list_of_cols_to_sum].astype(float).sum(axis=1)
return(frame.sort_values(by=new_col_name,ascending=False).head(2))
sum_frame_by_column2(df_CB, 'Sum', ['Acceleration', 'SprintSpeed', 'Stamina', 'Balance', 'Agility','Jumping','StandingTackle','SlidingTackle','Aggression','Crossing'])
best11=best11.append(sum_frame_by_column2(df_CB, 'Sum', ['Acceleration', 'SprintSpeed', 'Stamina', 'Balance', 'Agility','Jumping','StandingTackle','SlidingTackle','Aggression','Crossing']),ignore_index=True)
best11
#For RB
df_RB=df[df['Position']=='RB']
pd.set_option('mode.chained_assignment', None)
sum_frame_by_column(df_RB, 'Sum', ['Acceleration', 'SprintSpeed', 'Stamina', 'Balance', 'Agility','Jumping','StandingTackle','SlidingTackle','Aggression','Strength'])
best11=best11.append(sum_frame_by_column(df_RB, 'Sum', ['Acceleration', 'SprintSpeed', 'Stamina', 'Balance', 'Agility','Jumping','StandingTackle','SlidingTackle','Aggression','Strength']),ignore_index=True)
best11
#For the 2 CMs
df_CM=df[df['Position']=='CM']
pd.set_option('mode.chained_assignment', None)
def sum_frame_by_column2(frame, new_col_name, list_of_cols_to_sum):
frame[new_col_name] = frame[list_of_cols_to_sum].astype(float).sum(axis=1)
return(frame.sort_values(by=new_col_name,ascending=False).head(2))
sum_frame_by_column2(df_CM, 'Sum', [ 'Balance', 'ShortPass', 'Agility', 'Stamina', 'Acceleration','BallControl','LongPass','SprintSpeed','Dribble','Vision'])
best11=best11.append(sum_frame_by_column2(df_CM, 'Sum', ['Balance', 'ShortPass', 'Agility', 'Stamina', 'Acceleration','BallControl','LongPass','SprintSpeed','Dribble','Vision']),ignore_index=True)
best11
#For LW
df_LW=df[df['Position']=='LW']
pd.set_option('mode.chained_assignment', None)
sum_frame_by_column(df_LW, 'Sum', ['Acceleration', 'SprintSpeed', 'Agility', 'Balance', 'Dribble','BallControl','ShotPower','ShortPass','Positioning','Stamina'])
best11=best11.append(sum_frame_by_column(df_LW, 'Sum', ['Acceleration', 'SprintSpeed', 'Agility', 'Balance', 'Dribble','BallControl','ShotPower','ShortPass','Positioning','Stamina']),ignore_index=True)
best11
#For CAM
df_CAM=df[df['Position']=='CAM']
pd.set_option('mode.chained_assignment', None)
sum_frame_by_column(df_CAM, 'Sum', ['Balance', 'Agility', 'Acceleration', 'SprintSpeed', 'BallControl','Dribble','ShortPass','Vision','ShotPower','Composure'])
best11=best11.append(sum_frame_by_column(df_CAM, 'Sum', ['Balance', 'Agility', 'Acceleration', 'SprintSpeed', 'BallControl','Dribble','ShortPass','Vision','ShotPower','Composure']),ignore_index=True)
best11
#For RW
df_RW=df[df['Position']=='RW']
pd.set_option('mode.chained_assignment', None)
sum_frame_by_column(df_RW, 'Sum', ['Acceleration', 'SprintSpeed', 'Agility', 'Balance', 'Dribble','BallControl','ShotPower','Stamina','Positioning','Crossing'])
best11=best11.append(sum_frame_by_column(df_RW, 'Sum', ['Acceleration', 'SprintSpeed', 'Agility', 'Balance', 'Dribble','BallControl','ShotPower','Stamina','Positioning','Crossing']),ignore_index=True)
best11
#For ST
df_ST=df[df['Position']=='ST']
pd.set_option('mode.chained_assignment', None)
sum_frame_by_column(df_ST, 'Sum', ['SprintSpeed', 'Acceleration', 'Strength', 'Jumping', 'finishing','ShotPower','Agility','Positioning','Balance','BallControl'])
best11=best11.append(sum_frame_by_column(df_ST, 'Sum', ['SprintSpeed', 'Acceleration', 'Strength', 'Jumping', 'finishing','ShotPower','Agility','Positioning','Balance','BallControl']),ignore_index=True)
best11
best11
#This is the dream team line up
path="/Users/vats/Downloads/dreamteam.PNG"
display(Image.open(path))
#Most valuable teams
df.groupby(by=df['Club'])['Value'].agg(sum).sort_values(ascending=False).head()
#Clubs with the highest mean potential
df.groupby(by=df['Club'])['Potential'].agg('mean').sort_values(ascending=False).head()
col=['Potential']
X = df[col]
X = sm.add_constant(X)
y = df['Value']
pd.concat([X, y], axis=1).head(10)
model = sm.OLS(endog=y, exog=X)
results = model.fit()
results.summary()
# different set of packages have to be installed and imported for this map
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True) #initiate Plotly notebook mode
import plotly.graph_objs as go
overall = pd.DataFrame(df.groupby(['Country'])['Overall'].mean().reset_index())
count = pd.DataFrame(overall.groupby('Country')['Overall'].mean().reset_index())
#this count dataframe has countrywise mean overall value (considered overall of every player for a country and then computed mean)
trace = [go.Choropleth(
colorscale = 'YlGnBu',
locationmode = 'country names',
locations = count['Country'],
text = count['Country'],
z = count['Overall'],
)]
layout = go.Layout(title = 'Country vs Mean Overall Rating')
fig = go.Figure(data = trace, layout = layout)
py.iplot(fig)
import geopandas
fp = '/Users/vats/Downloads/World_Countries/World_Countries.shp'
geo_df = geopandas.read_file(fp) # this dataframe is GEOdataframe and not a normal dataframe
geo_df.plot()
# join our dataframe named count defined in analysis point 7 with geodataframe obtained above
merged = geo_df.set_index('COUNTRY').join(count.set_index('Country'))
variable = "Overall" # column to visualize on the choropleth map generated
vmin, vmax = 10000,200000
fig, ax = plt.subplots(1, figsize=(10, 6))
ax.axis("off")
ax.set_title('Overall rating for each country', fontdict={'fontsize': '25', 'fontweight' : '3'})
sm = plt.cm.ScalarMappable(cmap='YlGnBu', norm=plt.Normalize(vmin=vmin, vmax=vmax)) #colorbar that you see on the right as a legend to the map
sm._A = []
cbar = fig.colorbar(sm)
merged.plot(column=variable, cmap="YlGnBu", linewidth=0.8, ax=ax, edgecolor="0.8")